This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters for this templated notebook; the papermill-injected
# "# Parameters" cell below overrides them for each generated report.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True  # True: retrain from scratch; False: load a previously saved model
# Parameters
input_dir = "results/kraken2_PE_500/notebooks/PE_500_Sex/prepped_data"
output_dir = "results/kraken2_PE_500/notebooks/PE_500_Sex/LR_clinical"
retrain = True
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the prepared label vector and clinical-metadata matrix.
# NOTE: pickle.load can execute arbitrary code when the file is untrusted;
# these files come from our own preprocessing step, so this is accepted.
# The taxonomic feature matrix X is deliberately not loaded — this report
# evaluates a clinical-metadata-only classifier.
# with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
#     X = pk.load(infile_X)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)
# Model input: clinical metadata only. To also use microbiome features:
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = meta_data_mat
n_splits = 10  # number of repeated train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree avoids shelling out to
    # `rm -rf` (no shell interpretation of the path; failures raise instead
    # of being silently ignored by os.system).
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    # makedirs also creates missing parent directories; os.mkdir fails when
    # output_dir's parent does not exist yet.
    os.makedirs(output_dir)
    # Model definition: L2-penalized logistic regression with balanced class
    # weights; C tuned by 5-fold grid search maximizing F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # Grid spans C = e^10 down to e^-9.
    # NOTE(review): selected C values have been observed at the upper edge of
    # this grid (e^10) — consider widening the range.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(
        clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # The seed here sets the sequence of seeds passed to the TrainTester
    # objects created from MyTrainTester as a template, so only the sample
    # split differs across the n_splits repetitions.
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42,
                                          n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Save results; the context manager closes the file even if pickling fails.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split train and test F1 scores in long format.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames (indices 0..n-1
# are kept for each stage, matching the old append behavior).
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Regularization strength C selected by grid search in each of the n_splits
# fitted models. (For elastic net, also collect 'l1_ratio':
# hyperparams = {'l1_ratio': [], 'C': []})
feats_in_split = []
hyperparams = {
    'C': [MyMultiTrainTester.TrainerList[i].model.best_params_['C']
          for i in range(n_splits)]
}
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
# Collect per-split scoring metrics (long format: 'score_type' / 'value')
# and persist them alongside the model output.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
# Distribution of each metric across the train/test splits.
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summary statistics (mean / median / population std) for each scoring metric
# across the splits.
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for name in score_names:
    mask = scoring_metrics_df['score_type'] == name
    values = scoring_metrics_df['value'].to_numpy()[mask]
    score_stats['score'].append(name)
    score_stats['mean'].append(np.mean(values))
    score_stats['median'].append(np.median(values))
    # np.std uses ddof=0 (population std), unlike pandas' default ddof=1.
    score_stats['std_dev'].append(np.std(values))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
# Write the summary stats into output_dir, consistent with scoring_metrics.csv
# above (previously this landed in the notebook's working directory).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Per-split confusion matrices, with and without normalization
# (normalization scheme is defined inside MultiTrainTester.plot_confusion).
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
# Predicted class frequencies per split (normalized).
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
# Precision-recall curves per split.
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True
# Parameters
input_dir = "results/kraken2_PE_1K/notebooks/PE_1K_Sex/prepped_data"
output_dir = "results/kraken2_PE_1K/notebooks/PE_1K_Sex/LR_clinical"
retrain = True
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
# X = pk.load(infile_X)
# infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
meta_data_mat = pk.load(infile_meta_data_mat)
infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = meta_data_mat
n_splits = 10  # number of repeated train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree avoids shelling out to
    # `rm -rf` (no shell interpretation of the path; failures raise instead
    # of being silently ignored by os.system).
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    # makedirs also creates missing parent directories; os.mkdir fails when
    # output_dir's parent does not exist yet.
    os.makedirs(output_dir)
    # Model definition: L2-penalized logistic regression with balanced class
    # weights; C tuned by 5-fold grid search maximizing F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # Grid spans C = e^10 down to e^-9.
    # NOTE(review): selected C values have been observed at the upper edge of
    # this grid (e^10) — consider widening the range.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(
        clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # The seed here sets the sequence of seeds passed to the TrainTester
    # objects created from MyTrainTester as a template, so only the sample
    # split differs across the n_splits repetitions.
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42,
                                          n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Save results; the context manager closes the file even if pickling fails.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split train and test F1 scores in long format.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames (indices 0..n-1
# are kept for each stage, matching the old append behavior).
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
feats_in_split = []
hyperparams = {'C': []}
for i in range(n_splits):
hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['C'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
score_stats['score'].append(score)
score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
score_stats['mean'].append(np.mean(score_vect))
score_stats['median'].append(np.median(score_vect))
score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
# Write the summary stats into output_dir, consistent with scoring_metrics.csv
# above (previously this landed in the notebook's working directory).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True
# Parameters
input_dir = "results/kraken2_PE_5K/notebooks/PE_5K_Sex/prepped_data"
output_dir = "results/kraken2_PE_5K/notebooks/PE_5K_Sex/LR_clinical"
retrain = True
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
# X = pk.load(infile_X)
# infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
meta_data_mat = pk.load(infile_meta_data_mat)
infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = meta_data_mat
n_splits = 10  # number of repeated train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree avoids shelling out to
    # `rm -rf` (no shell interpretation of the path; failures raise instead
    # of being silently ignored by os.system).
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    # makedirs also creates missing parent directories; os.mkdir fails when
    # output_dir's parent does not exist yet.
    os.makedirs(output_dir)
    # Model definition: L2-penalized logistic regression with balanced class
    # weights; C tuned by 5-fold grid search maximizing F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # Grid spans C = e^10 down to e^-9.
    # NOTE(review): selected C values have been observed at the upper edge of
    # this grid (e^10) — consider widening the range.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(
        clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # The seed here sets the sequence of seeds passed to the TrainTester
    # objects created from MyTrainTester as a template, so only the sample
    # split differs across the n_splits repetitions.
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42,
                                          n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Save results; the context manager closes the file even if pickling fails.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split train and test F1 scores in long format.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames (indices 0..n-1
# are kept for each stage, matching the old append behavior).
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
feats_in_split = []
hyperparams = {'C': []}
for i in range(n_splits):
hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['C'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
score_stats['score'].append(score)
score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
score_stats['mean'].append(np.mean(score_vect))
score_stats['median'].append(np.median(score_vect))
score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
# Write the summary stats into output_dir, consistent with scoring_metrics.csv
# above (previously this landed in the notebook's working directory).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True
# Parameters
input_dir = "results/kraken2_PE_10K/notebooks/PE_10K_Sex/prepped_data"
output_dir = "results/kraken2_PE_10K/notebooks/PE_10K_Sex/LR_clinical"
retrain = True
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
# X = pk.load(infile_X)
# infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
meta_data_mat = pk.load(infile_meta_data_mat)
infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = meta_data_mat
n_splits = 10  # number of repeated train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree avoids shelling out to
    # `rm -rf` (no shell interpretation of the path; failures raise instead
    # of being silently ignored by os.system).
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    # makedirs also creates missing parent directories; os.mkdir fails when
    # output_dir's parent does not exist yet.
    os.makedirs(output_dir)
    # Model definition: L2-penalized logistic regression with balanced class
    # weights; C tuned by 5-fold grid search maximizing F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # Grid spans C = e^10 down to e^-9.
    # NOTE(review): selected C values have been observed at the upper edge of
    # this grid (e^10) — consider widening the range.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(
        clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # The seed here sets the sequence of seeds passed to the TrainTester
    # objects created from MyTrainTester as a template, so only the sample
    # split differs across the n_splits repetitions.
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42,
                                          n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Save results; the context manager closes the file even if pickling fails.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split train and test F1 scores in long format.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames (indices 0..n-1
# are kept for each stage, matching the old append behavior).
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
feats_in_split = []
hyperparams = {'C': []}
for i in range(n_splits):
hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['C'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
score_stats['score'].append(score)
score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
score_stats['mean'].append(np.mean(score_vect))
score_stats['median'].append(np.median(score_vect))
score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
# Write the summary stats into output_dir, consistent with scoring_metrics.csv
# above (previously this landed in the notebook's working directory).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters (the notebook's "parameters"-tagged cell).
# These are placeholders; papermill overrides them with the injected cell below.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True
# Parameters
# Papermill-injected parameter cell: the values actually used for this run.
input_dir = "results/kraken2_PE_25K/notebooks/PE_25K_Sex/prepped_data"
output_dir = "results/kraken2_PE_25K/notebooks/PE_25K_Sex/LR_clinical"
retrain = True
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load model inputs from the prepped-data directory.
# X.pk (the taxa abundance matrix) is intentionally skipped: this notebook
# trains on clinical metadata only.
# infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
# X = pk.load(infile_X)
# infile_X.close()

# `with` guarantees the handles are closed even if unpickling raises
# (the original used manual open/close pairs).
# NOTE: pickle.load executes arbitrary code on load; only use artifacts
# produced by this trusted pipeline.
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # target labels

with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix

# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)  # taxa + clinical variant
X_inp = meta_data_mat  # clinical metadata only

n_splits = 10  # number of train/test resampling splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')  # trained-model artifact
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the old
    # os.system('rm -rf ' + output_dir), which was shell-injection-prone
    # (unquoted interpolated path) and not portable.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)  # makedirs: also creates missing parent dirs

    # Model definition: L2 logistic regression with balanced class weights
    # (counters label imbalance); C chosen by 5-fold grid search on F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # Grid spans e^-9 ... e^10. NOTE(review): downstream results select
    # C = e^10 (~22026), the grid's upper boundary -- the optimum may lie
    # outside the grid; consider widening it.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(
        clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)

    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)

    # save results (`with` closes the handle even on a pickling error)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Collect per-split F1 for the train and test stages into one long frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
# Box plot comparing the F1 distribution between train and test splits.
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Inspect the regularization strength C selected by grid search per split.
# (l1_ratio only applies to elastic-net; kept commented for reference.)
# hyperparams = {'l1_ratio': [], 'C': []}
feats_in_split = []  # unused in this notebook; retained for template compatibility
hyperparams = {
    'C': [MyMultiTrainTester.TrainerList[i].model.best_params_['C']
          for i in range(n_splits)]
}
hyperparams_df = pd.DataFrame(hyperparams)
# NOTE(review): every split selects C ~= 22026 = e^10, the top of the search
# grid -- the true optimum may lie beyond it.
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
# Full per-split scoring table (AUROC/AUPRC per class, F1, PPV/NPV,
# sensitivity, specificity), persisted for the report, then plotted.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize each metric across the splits: mean, median, and population
# standard deviation (np.std, ddof=0 -- matching the original computation).
score_stats_rows = []
for metric_name in np.unique(scoring_metrics['score_type']):
    metric_mask = scoring_metrics_df['score_type'] == metric_name
    metric_values = scoring_metrics_df['value'].to_numpy()[metric_mask]
    score_stats_rows.append({
        'score': metric_name,
        'mean': np.mean(metric_values),
        'median': np.median(metric_values),
        'std_dev': np.std(metric_values),
    })
score_stats_df = pd.DataFrame(score_stats_rows)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
# Write next to the other artifacts (was writing to the CWD, inconsistent
# with scoring_metrics.csv which goes to output_dir).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots aggregated over all splits: confusion matrices
# (normalized and raw counts), class frequencies, and precision-recall curves.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True
# Parameters
input_dir = "results/kraken2_PE_50K/notebooks/PE_50K_Sex/prepped_data"
output_dir = "results/kraken2_PE_50K/notebooks/PE_50K_Sex/LR_clinical"
retrain = True
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
# X = pk.load(infile_X)
# infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
meta_data_mat = pk.load(infile_meta_data_mat)
infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = meta_data_mat
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the old
    # os.system('rm -rf ' + output_dir), which was shell-injection-prone
    # (unquoted interpolated path) and not portable.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)  # makedirs: also creates missing parent dirs

    # Model definition: L2 logistic regression with balanced class weights
    # (counters label imbalance); C chosen by 5-fold grid search on F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # Grid spans e^-9 ... e^10. NOTE(review): downstream results select
    # C = e^10 (~22026), the grid's upper boundary -- the optimum may lie
    # outside the grid; consider widening it.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(
        clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)

    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)

    # save results (`with` closes the handle even on a pickling error)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Collect per-split F1 for the train and test stages into one long frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
feats_in_split = []
hyperparams = {'C': []}
for i in range(n_splits):
hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['C'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
score_stats['score'].append(score)
score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
score_stats['mean'].append(np.mean(score_vect))
score_stats['median'].append(np.median(score_vect))
score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
# Write next to the other artifacts (was writing to the CWD, inconsistent
# with scoring_metrics.csv which goes to output_dir).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True
# Parameters
input_dir = "results/kraken2_PE_100K/notebooks/PE_100K_Sex/prepped_data"
output_dir = "results/kraken2_PE_100K/notebooks/PE_100K_Sex/LR_clinical"
retrain = True
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
# X = pk.load(infile_X)
# infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
meta_data_mat = pk.load(infile_meta_data_mat)
infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = meta_data_mat
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the old
    # os.system('rm -rf ' + output_dir), which was shell-injection-prone
    # (unquoted interpolated path) and not portable.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)  # makedirs: also creates missing parent dirs

    # Model definition: L2 logistic regression with balanced class weights
    # (counters label imbalance); C chosen by 5-fold grid search on F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # Grid spans e^-9 ... e^10. NOTE(review): downstream results select
    # C = e^10 (~22026), the grid's upper boundary -- the optimum may lie
    # outside the grid; consider widening it.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(
        clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)

    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)

    # save results (`with` closes the handle even on a pickling error)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Collect per-split F1 for the train and test stages into one long frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
feats_in_split = []
hyperparams = {'C': []}
for i in range(n_splits):
hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['C'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
score_stats['score'].append(score)
score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
score_stats['mean'].append(np.mean(score_vect))
score_stats['median'].append(np.median(score_vect))
score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
# Write next to the other artifacts (was writing to the CWD, inconsistent
# with scoring_metrics.csv which goes to output_dir).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True
# Parameters
input_dir = "results/kraken2_PE_500K/notebooks/PE_500K_Sex/prepped_data"
output_dir = "results/kraken2_PE_500K/notebooks/PE_500K_Sex/LR_clinical"
retrain = True
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
# X = pk.load(infile_X)
# infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
meta_data_mat = pk.load(infile_meta_data_mat)
infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = meta_data_mat
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the old
    # os.system('rm -rf ' + output_dir), which was shell-injection-prone
    # (unquoted interpolated path) and not portable.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)  # makedirs: also creates missing parent dirs

    # Model definition: L2 logistic regression with balanced class weights
    # (counters label imbalance); C chosen by 5-fold grid search on F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # Grid spans e^-9 ... e^10. NOTE(review): downstream results select
    # C = e^10 (~22026), the grid's upper boundary -- the optimum may lie
    # outside the grid; consider widening it.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(
        clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)

    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)

    # save results (`with` closes the handle even on a pickling error)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Collect per-split F1 for the train and test stages into one long frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
feats_in_split = []
hyperparams = {'C': []}
for i in range(n_splits):
hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['C'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
score_stats['score'].append(score)
score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
score_stats['mean'].append(np.mean(score_vect))
score_stats['median'].append(np.median(score_vect))
score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
# Write next to the other artifacts (was writing to the CWD, inconsistent
# with scoring_metrics.csv which goes to output_dir).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True
# Parameters
input_dir = "results/kraken2_PE_1M/notebooks/PE_1M_Sex/prepped_data"
output_dir = "results/kraken2_PE_1M/notebooks/PE_1M_Sex/LR_clinical"
retrain = True
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
# X = pk.load(infile_X)
# infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
meta_data_mat = pk.load(infile_meta_data_mat)
infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = meta_data_mat
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the old
    # os.system('rm -rf ' + output_dir), which was shell-injection-prone
    # (unquoted interpolated path) and not portable.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)  # makedirs: also creates missing parent dirs

    # Model definition: L2 logistic regression with balanced class weights
    # (counters label imbalance); C chosen by 5-fold grid search on F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # Grid spans e^-9 ... e^10. NOTE(review): downstream results select
    # C = e^10 (~22026), the grid's upper boundary -- the optimum may lie
    # outside the grid; consider widening it.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(
        clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)

    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)

    # save results (`with` closes the handle even on a pickling error)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Collect per-split F1 for the train and test stages into one long frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
feats_in_split = []
hyperparams = {'C': []}
for i in range(n_splits):
hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['C'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
score_stats['score'].append(score)
score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
score_stats['mean'].append(np.mean(score_vect))
score_stats['median'].append(np.median(score_vect))
score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
# Write next to the other artifacts (was writing to the CWD, inconsistent
# with scoring_metrics.csv which goes to output_dir).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default papermill parameters (overridden by the injected parameters below
# when the notebook is executed via papermill).
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True
# Parameters
# Run-specific values injected by papermill; they override the defaults above.
input_dir = "results/kraken2_PE_5M/notebooks/PE_5M_Sex/prepped_data"
output_dir = "results/kraken2_PE_5M/notebooks/PE_5M_Sex/LR_clinical"
retrain = True
# Sanity check: list the prepped-data files available in the input directory.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load pickled model inputs produced by the prep notebook.
# NOTE(review): pickle.load can execute arbitrary code on untrusted files;
# these are produced by the upstream pipeline and assumed trusted.
# Taxonomic features (X) are deliberately left out of this clinical-only model:
# with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
#     X = pk.load(infile_X)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # target labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical metadata matrix
# model input: clinical metadata only (no kraken2 abundance features)
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = meta_data_mat
n_splits = 10  # number of outer train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Re-train from scratch: clear any previous results directory first.
    import shutil
    if os.path.exists(output_dir):
        # shutil.rmtree is portable and avoids building a shell command from a
        # path string (os.system('rm -rf ' + output_dir) breaks on paths with
        # spaces and is a command-injection hazard).
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # Model: L2-penalised logistic regression with balanced class weights;
    # C selected by 5-fold grid search maximising F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # C grid spans e^10 down to e^-9. NOTE(review): the selected C below
    # (22026.47 = e^10) sits at the grid boundary — consider widening the grid.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(clf, param_grid,
                                         scoring=metrics.make_scorer(metrics.f1_score),
                                         cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # The seed here drives the sequence of seeds passed to the TrainTester
    # objects cloned from MyTrainTester, so all settings except the seed
    # affecting the sample split are shared across the data splits.
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results so the notebook can be re-rendered without retraining
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results from an earlier run
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split F1 scores for the train and test stages, in long format.
# pandas DataFrame.append was deprecated in 1.4 and removed in 2.0 — build
# both frames and concatenate once instead (the duplicated 0..n index of the
# original output is preserved because ignore_index is not set).
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Record the C value chosen by the inner grid search on each split.
# hyperparams = {'l1_ratio': [], 'C': []}
feats_in_split = []
hyperparams = {
    'C': [MyMultiTrainTester.TrainerList[split].model.best_params_['C']
          for split in range(n_splits)]
}
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
# Collect the full suite of scoring metrics from the multi-split train/tester
# (long format with 'score_type' and 'value' columns) and persist to CSV.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
# Boxplot of each metric's distribution across the splits.
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarise each score type across splits: mean, median and population
# standard deviation (ddof=0, matching np.std), sorted by score name.
score_stats_df = (
    scoring_metrics_df
    .groupby('score_type')['value']
    .agg(mean='mean',
         median='median',
         std_dev=lambda v: np.std(v.to_numpy()))
    .rename_axis('score')
    .reset_index()
)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
score_stats_df.to_csv('score_stats.csv')
# Per-split diagnostic plots: normalised and raw confusion matrices,
# class frequencies, and precision-recall curves.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library, allowing automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default papermill parameters (overridden by the injected parameters below
# when the notebook is executed via papermill).
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_clinical_only_generic'
retrain = True
# Parameters
# Run-specific values injected by papermill; they override the defaults above.
input_dir = "results/kraken2_PE_10M/notebooks/PE_10M_Sex/prepped_data"
output_dir = "results/kraken2_PE_10M/notebooks/PE_10M_Sex/LR_clinical"
retrain = True
# Sanity check: list the prepped-data files available in the input directory.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load pickled model inputs produced by the prep notebook.
# NOTE(review): pickle.load can execute arbitrary code on untrusted files;
# these are produced by the upstream pipeline and assumed trusted.
# Taxonomic features (X) are deliberately left out of this clinical-only model:
# with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
#     X = pk.load(infile_X)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # target labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical metadata matrix
# model input: clinical metadata only (no kraken2 abundance features)
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = meta_data_mat
n_splits = 10  # number of outer train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Re-train from scratch: clear any previous results directory first.
    import shutil
    if os.path.exists(output_dir):
        # shutil.rmtree is portable and avoids building a shell command from a
        # path string (os.system('rm -rf ' + output_dir) breaks on paths with
        # spaces and is a command-injection hazard).
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # Model: L2-penalised logistic regression with balanced class weights;
    # C selected by 5-fold grid search maximising F1.
    clf = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                penalty='l2', solver='liblinear')
    # C grid spans e^10 down to e^-9. NOTE(review): the selected C below
    # (22026.47 = e^10) sits at the grid boundary — consider widening the grid.
    param_grid = dict(C=np.exp(-np.arange(-10, 10)))
    model = model_selection.GridSearchCV(clf, param_grid,
                                         scoring=metrics.make_scorer(metrics.f1_score),
                                         cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # The seed here drives the sequence of seeds passed to the TrainTester
    # objects cloned from MyTrainTester, so all settings except the seed
    # affecting the sample split are shared across the data splits.
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results so the notebook can be re-rendered without retraining
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results from an earlier run
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split F1 scores for the train and test stages, in long format.
# pandas DataFrame.append was deprecated in 1.4 and removed in 2.0 — build
# both frames and concatenate once instead (the duplicated 0..n index of the
# original output is preserved because ignore_index is not set).
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.709302 | train |
| 1 | 0.755043 | train |
| 2 | 0.738889 | train |
| 3 | 0.719298 | train |
| 4 | 0.713864 | train |
| 5 | 0.713864 | train |
| 6 | 0.735955 | train |
| 7 | 0.713450 | train |
| 8 | 0.721408 | train |
| 9 | 0.699422 | train |
| 0 | 0.769231 | test |
| 1 | 0.590909 | test |
| 2 | 0.640000 | test |
| 3 | 0.731183 | test |
| 4 | 0.750000 | test |
| 5 | 0.750000 | test |
| 6 | 0.658228 | test |
| 7 | 0.752688 | test |
| 8 | 0.723404 | test |
| 9 | 0.808989 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Record the C value chosen by the inner grid search on each split.
# hyperparams = {'l1_ratio': [], 'C': []}
feats_in_split = []
hyperparams = {
    'C': [MyMultiTrainTester.TrainerList[split].model.best_params_['C']
          for split in range(n_splits)]
}
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | |
|---|---|
| 0 | 22026.465795 |
| 1 | 22026.465795 |
| 2 | 22026.465795 |
| 3 | 22026.465795 |
| 4 | 22026.465795 |
| 5 | 22026.465795 |
| 6 | 22026.465795 |
| 7 | 22026.465795 |
| 8 | 22026.465795 |
| 9 | 22026.465795 |
# Collect the full suite of scoring metrics from the multi-split train/tester
# (long format with 'score_type' and 'value' columns) and persist to CSV.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
# Boxplot of each metric's distribution across the splits.
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarise each score type across splits: mean, median and population
# standard deviation (ddof=0, matching np.std), sorted by score name.
score_stats_df = (
    scoring_metrics_df
    .groupby('score_type')['value']
    .agg(mean='mean',
         median='median',
         std_dev=lambda v: np.std(v.to_numpy()))
    .rename_axis('score')
    .reset_index()
)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.544596 | 0.517155 | 0.064890 |
| 1 | AUPRC_POS | 0.668019 | 0.693702 | 0.062977 |
| 2 | AUROC_NEG | 0.657184 | 0.655406 | 0.060417 |
| 3 | AUROC_POS | 0.657184 | 0.655406 | 0.060417 |
| 4 | f1_score | 0.717463 | 0.740591 | 0.063375 |
| 5 | npv_score | 0.622592 | 0.603571 | 0.078652 |
| 6 | ppv_score | 0.698990 | 0.721514 | 0.064877 |
| 7 | sensitivity | 0.737501 | 0.755556 | 0.065058 |
| 8 | specificity | 0.576867 | 0.586207 | 0.073382 |
score_stats_df.to_csv('score_stats.csv')
# Per-split diagnostic plots: normalised and raw confusion matrices,
# class frequencies, and precision-recall curves.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))